% UniViLM arXiv preprint (arXiv:2002.06353). The journal field is kept so that
% classic styles, which require it for article entries, still render the
% preprint id; eprint/archiveprefix carry the same id in machine-readable form.
@article{luo2020univilm,
  author        = {Luo, Huaishao and Ji, Lei and Shi, Botian and Huang, Haoyang and Duan, Nan and Li, Tianrui and Chen, Xilin and Zhou, Ming},
  title         = {{UniViLM}: A Unified Video and Language Pre-Training Model for Multimodal Understanding and Generation},
  journal       = {arXiv preprint arXiv:2002.06353},
  year          = {2020},
  eprint        = {2002.06353},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/pdf/2002.06353.pdf}
}